# loading packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.4.4
## Loading required package: ggplot2
library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.4.4
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:ggmap':
##
## inset
library(ggdendro)
## Warning: package 'ggdendro' was built under R version 3.4.4
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.4
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(wesanderson)
## Warning: package 'wesanderson' was built under R version 3.4.4
# read-in data
trails = read.csv('Trails.csv')
# keep only the trail attributes of interest, then sort by park id (PMAID)
# and the per-park trail number (TRAIL_NUM)
trails1 = trails %>%
  select(PMA_NAME, TRAIL_ID, PMAID, TRAIL_NUM,
         SURFACE_TY, CONDITION, WIDTH, CANOPY, GRADE_TYPE,
         GRADE_PERC, SHAPE_LENG) %>%
  arrange(PMAID, TRAIL_NUM)
# clearer column names: TRAIL_ID -> ID, PMAID -> Park_ID, TRAIL_NUM -> Trail_ID
colnames(trails1)[2:4] = c('ID', 'Park_ID', 'Trail_ID')
trails2 = trails1 # get a copy of trails1 just in case
# data summary
summary(trails1) # note something with 'WIDTH'
## PMA_NAME ID Park_ID
## DISCOVERY PARK : 245 404-11 : 3 Min. : 234.0
## WARREN G. MAGNUSON PARK : 216 416-8 : 3 1st Qu.: 310.0
## WESTCREST PARK : 202 282-29 : 2 Median : 398.0
## WASHINGTON PARK AND ARBORETUM: 182 310-363: 2 Mean : 694.7
## LINCOLN PARK : 159 327-10 : 2 3rd Qu.: 460.0
## CAMP LONG : 125 347-14 : 2 Max. :4479.0
## (Other) :1495 (Other):2610
## Trail_ID SURFACE_TY CONDITION WIDTH
## Min. : 1.00 Gravel :1213 Eroded : 70 Min. : 1.00
## 1st Qu.: 13.00 Soil : 615 Good :1443 1st Qu.: 3.00
## Median : 40.00 Concrete: 275 Overgrown: 67 Median : 4.00
## Mean : 70.64 Stairs : 190 Poor : 128 Mean : 4.24
## 3rd Qu.:103.00 Asphalt : 145 Worn : 916 3rd Qu.: 5.00
## Max. :391.00 Grass : 63 Max. :103.00
## (Other) : 123
## CANOPY GRADE_TYPE GRADE_PERC SHAPE_LENG
## Closed: 330 Flat :739 Min. : 0.000 Min. : 2.516
## High : 747 Gradual :986 1st Qu.: 3.000 1st Qu.: 88.489
## Low : 323 Moderate:608 Median : 5.000 Median : 177.548
## Open :1224 Steep :291 Mean : 4.714 Mean : 279.527
## 3rd Qu.: 7.000 3rd Qu.: 344.706
## Max. :20.000 Max. :4584.925
##
# check the numeric 'WIDTH' distribution
ggplot(trails1, aes(x = WIDTH)) + geom_bar()
# the observation with WIDTH == 103 looks like an outlier -- drop it
# (as before, filter() also drops any rows where WIDTH is NA)
trails1 = trails1 %>% filter(WIDTH != 103)
After this simple data summary, I came up with some questions:
Question 1: What is the variable ‘GRADE_PERC’? It seems to be closely related to ‘GRADE_TYPE’. I could not find complete metadata from the data source, the City of Seattle (Seattle.gov), so I could not get an official explanation of this variable.
Question2: Are there any relationships among the trail data attributes (‘SURFACE_TYPE’, ‘CONDITION’,‘WIDTH’, ‘CANOPY’, ‘GRADE_TYPE’ and ‘GRADE_PERC’) for different city parks in Seattle? Particularly, which ones have relatively stronger relationships with trail condition?
Question 3: Do any parks share common trail features? Are they close to each other? What are their features? Is this information valuable for future city planning?
Step 1: Find trail-attribute representatives for each park and use them to do hierarchical clustering. This may provide some insight into how parks differ with respect to their trails.
Step 2: Build a ‘map matrix’ for interesting trail attributes based on the PCA results from step 1.
# Check 'Grade_type' levels and transform to ordinal numbers
# Flat--1, gradual--2, moderate--3, steep--4
levels(trails1$GRADE_TYPE)
## [1] "Flat" "Gradual" "Moderate" "Steep"
# NOTE(review): replacing levels() keeps the column a factor whose labels
# are the characters "1".."4"; the real numeric conversion happens later
# via apply(..., as.numeric).  The replacement vector must be in exactly
# the order that levels() printed above.
levels(trails1$GRADE_TYPE) = c(1,2,3,4)
# Check 'Canopy' levels and transform to ordinal numbers
# Closed--4, high--3, low--2, open--1
levels(trails1$CANOPY)
## [1] "Closed" "High" "Low" "Open"
levels(trails1$CANOPY) = c(4,3,2,1)
# Check 'Condition' levels and transform to ordinal numbers
# Worn and erode -- 1, good--4, overgrown--3, poor--2
levels(trails1$CONDITION)
## [1] "Eroded" "Good" "Overgrown" "Poor" "Worn"
# 'Eroded' and 'Worn' are deliberately merged into the same ordinal level 1
# (both indicate a degraded surface), collapsing five levels into four
levels(trails1$CONDITION) = c(1, 4, 3, 2, 1)
# Check 'Surface type' but found no clue for giving different categorical
# levels an order and transform them into a Likert Scale
# Try text clustering
levels(trails1$SURFACE_TY)
## [1] "Asphalt" "Bark" "Boardwalk" "Bridge" "Check Steps"
## [6] "Concrete" "Grass" "Gravel" "Soil" "Stairs"
# Vector Space Modeling
# Code modified from the code provided by instructor during the class
# I couldn't find proper document to describe grass-type trail so I would
# intuitively group it with soil as both of them sound close to nature
# Read one whitespace-delimited description document per surface type;
# scan(what = character()) returns a character vector of raw word tokens.
# NOTE(review): 'broadwalk' in the file name looks like a misspelling of
# 'boardwalk' (the factor level is "Boardwalk") -- confirm the file names.
d1 = scan("asphalt.txt", what = character())
d2 = scan("bark.txt", what = character())
d3 = scan("bridge.txt", what = character())
d4 = scan("broadwalk.txt", what = character())
d5 = scan("check_step.txt", what = character())
d6 = scan("concrete.txt", what = character())
d7 = scan("gravel.txt", what = character())
d8 = scan("soil.txt", what = character())
d9 = scan("step.txt", what = character())
# stop words to be removed from the text documents
# I remove the terms that appear in surface type names as well
# because 1. lots of documents seem to compare specific surface types
# 2. I am more interested in using other surface type properties to
# classify these surface types rather than using their names
rm.words = c("the", "The", "of", "it", "A", "a", "and", "to", "or",
"in", "on", "by", "as", "if", "for", "is", "are", "It",
"from", "that", "be", "do", "also", "an", "An", "with",
"asphalt", "bark", "bridge", "broadwalk", "step",
"concrete", "gravel", "soil")
# TRUE indicates elements of x not present in table
# (negation of %in%, which is itself defined as match(x, table, nomatch = 0) > 0)
"%not.in%" = function(x, tab) !(x %in% tab)
## pass in a document (character vector of raw tokens), get back cleaned tokens
clean.doc = function(d) {
## remove punctuation and numbers
d = gsub("[[:punct:]]+", "", d)
d = gsub("[[:digit:]]+", "", d)
## BUG FIX: lower-case BEFORE stop-word removal so matching is
## case-insensitive; previously capitalized variants not listed in
## rm.words (e.g. "Of", "Asphalt") slipped through the filter
d = tolower(d)
d = d[ d != "" ] # remove empty strings
d[ !(d %in% tolower(rm.words)) ] # remove stop words
}
# clean all nine raw documents with the same pipeline
d1 = clean.doc(d1)
d2 = clean.doc(d2)
d3 = clean.doc(d3)
d4 = clean.doc(d4)
d5 = clean.doc(d5)
d6 = clean.doc(d6)
d7 = clean.doc(d7)
d8 = clean.doc(d8)
d9 = clean.doc(d9)
## Unique terms from all documents
ds = paste("d", 1:9, sep="")  # document labels "d1".."d9" (matrix column names)
terms = unique(c(d1,d2,d3,d4,d5,d6,d7,d8,d9))  # combined vocabulary
nterms = length(terms) # number of terms
nds = 9 # number of documents
## For one document, count how many times each element of the global
## `terms` vector occurs in it.  Returns a numeric vector of length
## `length(terms)`, in the same order as `terms`, with zeros for terms
## that do not appear.
get.count = function(d) {
## table() on a factor with fixed levels counts every level -- including
## absent ones -- in the order of `terms`; this replaces the original
## O(length(terms) * length(d)) double loop with one vectorized call
as.numeric(table(factor(d, levels = terms)))
}
## tf values (the number of times a term appears in a document)
tf1 = get.count(d1)
tf2 = get.count(d2)
tf3 = get.count(d3)
tf4 = get.count(d4)
tf5 = get.count(d5)
tf6 = get.count(d6)
tf7 = get.count(d7)
tf8 = get.count(d8)
tf9 = get.count(d9)
tfs = cbind(tf1, tf2, tf3, tf4, tf5, tf6, tf7, tf8, tf9)  # term x document counts
dimnames(tfs) = list(terms, ds)
## the document frequency values
# (the number of documents that contain a term)
dfs = apply(tfs, 1, function(x) sum(x>0))
## compute the idf(inverse document frequency) values
# every vocabulary term occurs in at least one document, so dfs >= 1 and
# there is no division by zero; idf is 0 for terms present in all 9 docs
idf = log(nds/dfs, base=10)
## compute the tf-idf weights, i.e. the term-document matrix
# each column (document) is its tf vector weighted elementwise by idf
td.mat = apply(tfs, 2, function(x) x*idf)
## cosine similarity between two numeric vectors; returns a plain scalar
## (the original x %*% y form returned a 1x1 matrix -- every caller assigns
## the result into a matrix cell, so the scalar is fully compatible)
cos.simi = function(x, y) sum(x * y) / sqrt(sum(x^2) * sum(y^2))
## compute the similarities between all pairs of docs
# (cosine similarity is symmetric, so this fills both triangles; cheap at
# nds = 9)
S.mat = matrix(0, nrow=nds, ncol=nds) # will hold the similarities
for (i in 1:nds) {
for (j in 1:nds) {
x = td.mat[,i]
y = td.mat[,j]
S.mat[i,j] = cos.simi(x,y)
}
}
# append the names back
ds1 = c("asphalt", "bark", "bridge", "broadwalk", "check_step",
"concrete", "gravel", "soil", "stairs")
dimnames(S.mat) = list(ds1, ds1)
# tf-idf weights are non-negative, so similarity lies in [0, 1] and
# 1 - similarity is a valid dissimilarity for hclust
D = as.dist(1 - S.mat)
## agglomerative hierarchical clustering
hc = hclust(D, method="average")
plot(hc)
# complete linkage for comparison
hc1 = hclust(D, method = "complete")
plot(hc1)
I chose complete linkage, and the resulting clusters are:
Cluster 1: soil, grass (natural earth, soft, cheaper; level of human impact: 1)
Cluster 2: bark, boardwalk, check steps, stairs (neutral, relatively soft, smaller constructions; level of human impact: 2)
Cluster 3: bridge, asphalt, concrete, gravel (hard materials, larger constructions, least natural earth to look at; level of human impact: 3)
# transform the 'Surface_ty' to ordinal
# according to the above clustering results
# (1 = natural earth, 2 = small constructions, 3 = hard materials;
# replacement order must match the levels printed below)
levels(trails1$SURFACE_TY)
## [1] "Asphalt" "Bark" "Boardwalk" "Bridge" "Check Steps"
## [6] "Concrete" "Grass" "Gravel" "Soil" "Stairs"
levels(trails1$SURFACE_TY) = c(3, 2, 2, 3, 2, 3, 1, 3, 1, 2)
# Convert the recoded factor columns (5:10) to numeric.
# NOTE(review): apply() first coerces the data frame to a character
# matrix, so the factor LABELS ("1".."4") -- not the internal codes -- are
# what as.numeric parses; that is what makes the recode-via-levels trick
# above work.  The numeric WIDTH column round-trips through character
# unchanged.
trails1[,5:10] = apply(trails1[,5:10], 2, as.numeric)
# Run PCA
# columns 5:11 = the six recoded ordinal attributes plus SHAPE_LENG;
# scale = TRUE standardizes each variable so the (very different) units
# contribute equally
pc.trail = prcomp(trails1[,5:11], scale = TRUE,center=TRUE)
summary(pc.trail)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5144 1.1955 1.0453 0.9796 0.76487 0.69924 0.3888
## Proportion of Variance 0.3276 0.2042 0.1561 0.1371 0.08357 0.06985 0.0216
## Cumulative Proportion 0.3276 0.5318 0.6879 0.8250 0.90856 0.97840 1.0000
# Proportion of Variance Explained (PVE) code from text book
pve =100* pc.trail$sdev ^2/ sum(pc.trail$sdev ^2)
par(mfrow =c(1,2))  # scree plot and cumulative-PVE plot side by side
plot(pve, type ="o", ylab="PVE", xlab="Principal Component",
col ="red")
plot(cumsum (pve), type="o", ylab =" Cumulative PVE", xlab="
Principal Component", col = "darkblue")
# It turned out the first three principal components are adequate
# to explain about 70% of the variation in the data.
# First two principal components can help explain over 50% of the variation.
# Loading vectors for the first three principal components
pc.trail$rotation[,1:3]
## PC1 PC2 PC3
## SURFACE_TY -0.2664147 0.54985242 0.15028293
## CONDITION -0.3003709 -0.08559298 -0.69801083
## WIDTH -0.2477839 0.64937712 0.05105992
## CANOPY 0.4085213 0.07555024 0.49821832
## GRADE_TYPE 0.5558279 0.23902742 -0.33018694
## GRADE_PERC 0.5377682 0.29129634 -0.34855880
## SHAPE_LENG -0.1112421 0.34777868 -0.09407790
# biplot of the first two PCs: points = PC scores, segments/labels = loadings
# (loadings scaled by 10/11 only for display)
pcr = data.frame(pc.trail$rotation[,1:2])
# BUG FIX: alpha was previously passed INSIDE data.frame(), which merely
# added an unused constant 'alpha' column instead of setting the point
# transparency; it now goes to geom_point() as a fixed aesthetic
ggplot() +
  geom_point(aes(PC1, PC2), size = 0.2, alpha = 0.7,
             data = data.frame(pc.trail$x)) +
  annotate(geom = "segment", x = 0, y = 0, xend = pcr$PC1*10, yend = pcr$PC2*10,
           alpha = 0.9, color = "palegreen4", size = 0.8) +
  annotate(geom = "text", x = pcr$PC1*11, y = pcr$PC2*11, label = rownames(pcr),
           alpha = 0.9, size = 3.5, color = "palegreen4")
GRADE_TYPE and GRADE_PERC have the largest loading values, 0.56 and 0.54. They are both positive and very close in magnitude, so GRADE_PERC should be highly correlated with GRADE_TYPE. Observations (trail segments) with higher PC1 scores tend to have higher GRADE_TYPE or GRADE_PERC values (in other words, they are steeper).
SURFACE TYPE and WIDTH have the largest loadings, 0.55 and 0.65. They are both positive and a little bit off in magnitude. But they are relatively more correlated with each other than with other variables. Trail segments with higher PC2 scores tend to have ‘higher’ surface type value (harder materials, more expensive and greater human impact) and larger width. SHAPE_LENGTH’s loading value is around 0.35 which is not very close to the ones for surface type and width. So I deem it not very important.
CONDITION and CANOPY have the largest loadings in magnitude, 0.70 (-0.698) and 0.50 (0.498). However, they have opposite signs. Thus, these variables are negatively correlated with each other. Trail segments with higher PC3 scores tend to have ‘higher’ canopy values (more tree canopies) and ‘lower’ condition values (worse trail condition).
## Try to find parks with similar trail features
## Form a new dataset for plot and clustering
# For each attribute: group by (park, attribute level), total the segment
# lengths, then keep the level with the largest total.  summarise() drops
# the last grouping variable, so the result is still grouped by PMA_NAME
# and filter(tot_length == max(tot_length)) is evaluated per park.
# NOTE(review): if two levels tie for the maximum in some park, filter()
# keeps BOTH rows and the bind_cols() below would misalign parks -- TODO
# confirm there are no ties (or switch to a one-row-per-park selection).
# decide trail 'surface type' for each park by maximum total shape length
trails_surf = trails1 %>% group_by(PMA_NAME, SURFACE_TY) %>%
summarise(tot_length = sum(SHAPE_LENG)) %>%
filter(tot_length == max(tot_length))
# decide trail 'condition' for each park by maximum total shape length
trails_cond = trails1 %>% group_by(PMA_NAME, CONDITION) %>%
summarise(tot_length = sum(SHAPE_LENG))%>%
filter(tot_length == max(tot_length))
# decide trail 'width' for each park by maximum total shape length
trails_width = trails1 %>% group_by(PMA_NAME, WIDTH)%>%
summarise(tot_length = sum(SHAPE_LENG))%>%
filter(tot_length == max(tot_length))
# decide trail 'canopy' type for each park by maximum total shape length
trails_canopy = trails1 %>% group_by(PMA_NAME, CANOPY) %>%
summarise(tot_length = sum(SHAPE_LENG))%>%
filter(tot_length == max(tot_length))
# decide trail 'grade_type' for each park by maximum total shape length
trails_grade = trails1 %>% group_by(PMA_NAME, GRADE_TYPE) %>%
summarise(tot_length = sum(SHAPE_LENG)) %>%
filter(tot_length == max(tot_length))
# total length of trails in each park
trails_length = trails1 %>% group_by(PMA_NAME) %>%
summarise(TOT_LENGTH = sum(SHAPE_LENG))
# grade_perc
trails_perc = trails1 %>% group_by(PMA_NAME, GRADE_PERC) %>%
summarise(tot_length = sum(SHAPE_LENG)) %>%
filter(tot_length == max(tot_length))
# Form trails3 for clustering the parks
# bind_cols() relies on every summary above having exactly one row per
# park, in the same (sorted-by-PMA_NAME) order
trails3 = bind_cols(trails_surf[,1:2], trails_cond[,2], trails_width[,2],
trails_canopy[,2], trails_grade[,2],
trails_perc[,2], trails_length[,2])
trails3 = as.data.frame(trails3)
row.names(trails3) = trails3$PMA_NAME
trails3 = trails3[,-1]
# matrix to store similarity values
# pairwise cosine similarity between parks over their representative
# trail-attribute vectors (reuses cos.simi from the text-clustering step)
park_trail_simi = matrix(0, nrow=nrow(trails3), ncol=nrow(trails3))
# NOTE(review): the columns are on very different scales (TOT_LENGTH is
# thousands of feet vs ordinal codes 1-4), so the length column likely
# dominates the cosine -- consider standardizing trails3 first.
for (i in 1:nrow(trails3)) {
for (j in 1:nrow(trails3)) {
x = as.numeric(trails3[i,])
y = as.numeric(trails3[j,])
park_trail_simi[i,j] = cos.simi(x,y)
}
}
dimnames(park_trail_simi) = list(rownames(trails3), rownames(trails3))
# distances
D_park = as.dist(1 - park_trail_simi)
## hierarchical clustering and dendrogram
hc2 = hclust(D_park, method="complete")
# dashed line marks the candidate cut height used to read off clusters
ggdendrogram(hc2, rotate = TRUE, theme_dendro = FALSE) + geom_hline(aes(yintercept=2.7e-05), linetype = "dashed")
# PEPPI'S PLAYGROUND is isolated from the rest
# difficult to see other clusters
# remove PEPPI'S PLAYGROUND
# NOTE(review): row 46 is hard-coded; dropping the row by name would be
# safer, since any reordering of trails3 silently removes the wrong park
# -- confirm row 46 is indeed PEPPI'S PLAYGROUND before re-running.
trails4 = trails3[-46,]
# Recalculate the similarity matrix and distances
park_trail_simi1 = matrix(0, nrow=nrow(trails4), ncol=nrow(trails4))
for (i in 1:nrow(trails4)) {
for (j in 1:nrow(trails4)) {
x = as.numeric(trails4[i,])
y = as.numeric(trails4[j,])
park_trail_simi1[i,j] = cos.simi(x,y)
}
}
dimnames(park_trail_simi1) = list(rownames(trails4), rownames(trails4))
D_park1 = as.dist(1 - park_trail_simi1)
## hierarchical clustering and dendrogram
hc3 = hclust(D_park1, method="complete")
ggdendrogram(hc3, rotate = TRUE, theme_dendro = FALSE) +
geom_hline(yintercept = 2.69e-05, linetype = 'dashed')
# Visualize those parks on map
parks = rownames(trails4)
parks = paste(parks, 'Seattle,WA', sep=",")
# geocode the park locations (can return NA rows when the service
# rate-limits or fails to match)
park_geoc = geocode(parks, override_limit = TRUE)
# retry the rows that came back NA (a single retry pass, not a loop --
# re-run manually if NAs remain)
park_geoc[which(is.na(park_geoc$lon)),] =
geocode(parks[which(is.na(park_geoc$lon))])
# cache the geocoding results to disk (geocoding takes a long time).
# FIX: write.table() straight to the file; the previous sink()/file=""
# pattern produced the same output but left the sink open if
# write.table() errored.
write.table(park_geoc, file = "park_geoc.txt", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
parkntrail = bind_cols(park = parks, park_geoc, trails4)
# Seattle_KingCT = get_googlemap('Seattle', zoom = 10, scale = 2,
# type = 'hybrid', override_limit = TRUE)
# base_map = ggmap(Seattle_KingCT)
# save(base_map, file = "basemap.RData")
load("basemap.RData") # in case some error appears when running get_googlemap
## Grade_Type
# dominant grade type per park, as a factor so it gets a discrete legend.
# The scale_x/y_continuous calls crop the basemap to the Seattle area,
# which triggers the "Scale ... already present" messages below.
parkntrail$GRADE_TYPE = as.factor(parkntrail$GRADE_TYPE)
plot1 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = GRADE_TYPE), size = 2.5, alpha = 0.9,
data = parkntrail) + coord_fixed(1.6) +
scale_color_brewer(name = "GRADE", palette="RdYlGn", direction = -1)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Surface_Type
# dominant surface-type group (1/2/3 from the text clustering) per park
parkntrail$SURFACE_TY = as.factor(parkntrail$SURFACE_TY)
plot2 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = SURFACE_TY), size = 2.5, alpha = 0.9,
data = parkntrail) +
scale_color_manual(name = "SURFACE", values=wes_palette(n=3, name="Zissou1",
type = "continuous"))+ coord_fixed(1.6)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## WIDTH
# WIDTH stays numeric, so it is mapped to point size rather than colour
plot3 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, size= WIDTH), color = 'steelblue4', alpha = 0.6,
data = parkntrail) + coord_fixed(1.6)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## CANOPY
parkntrail$CANOPY = as.factor(parkntrail$CANOPY)
plot4 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = CANOPY), size = 2.5, alpha = 0.9,
data = parkntrail) + coord_fixed(1.6) +
scale_color_brewer(palette="PiYG", direction = 1)
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## CONDITION
# Worn and erode -- 1, poor--2, overgrown--3, good--4
parkntrail$CONDITION = as.factor(parkntrail$CONDITION)
# the palette subset [c(1, 4, 2)] supplies exactly three colours, i.e. it
# assumes only three condition levels occur at park level --
# NOTE(review): verify which levels are present if the data changes
plot5 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = CONDITION), size = 2.5, alpha = 0.9,
data = parkntrail) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=5, name="Darjeeling1")[c(1, 4, 2)])
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
# Append the ggmap of CONDITION to each of the other trail attribute ggmaps
# (condition map always on the left for side-by-side comparison; the
# widths/heights ratios are hand-tuned so both panels render at the same
# visual size despite different legend widths)
ggarrange(plot5, plot1, ncol = 2, nrow = 1, widths = c(1.08,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot5, plot2, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot5, plot3, ncol = 2, nrow = 1, widths = c(1.1,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot5, plot4, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
# Append the ggmap of CLUSTER to each of the other trail attribute ggmaps
# plot the CLUSTERs from clustering analysis
# Park clusterA: WEST DUWAMISH GREENBELT, SEOLA BEACH GREENBELT,
# THORNTON CREEK PARK #2
# Park clusterB: PEPPIS PLAYGROUND, HITT'S HILL PARK,
# LAKE PEOPLE PARK(XACUA'BS)
# Park clusterC: LICORICE FERN NA ON TC, CEDAR PARK, DEARBORN PARK,
# MARTHA WASHINGTON PARK, BHY KRACKE PARK, NORTH EAST
# QUEEN ANNE GREENBELT, LONGFELLOW CREEK GS:SOUTH,
# ST.MARKS GREENBELT, BOREN PARK, DUWAMISH HEAD GREENBELT
# Park clusterD: the rest of parks
# restore bare park names (the column currently holds the
# "<NAME>,Seattle,WA" geocoding strings) so the %in% lookups below match
parkntrail$park = rownames(trails4)
# cluster memberships read off the hc3 dendrogram cut.
# NOTE(review): the names must match rownames(trails4) exactly
# (spelling/punctuation); any mismatch silently lands that park in
# cluster D -- verify against the dendrogram labels.
cluA = which(parkntrail$park %in%
c("WEST DUWAMISH GREENBELT", "SEOLA BEACH GREENBELT", "THORNTON CREEK PARK #2"))
cluB = which(parkntrail$park %in%
c("PEPPIS PLAYGROUND", "HITT'S HILL PARK", "LAKE PEOPLE PARK (XACUA'BS)"))
cluC = which(parkntrail$park %in%
c("LICORICE FERN NA ON TC", "CEDAR PARK", "DEARBORN PARK", "MARTHA WASHINGTON PARK",
"BHY KRACKE PARK", "NORTH EAST QUEEN ANNE GREENBELT", "LONGFELLOW CREEK GS: SOUTH",
"ST. MARKS GREENBELT", "BOREN PARK", "DUWAMISH HEAD GREENBELT"))
cluD = (1:nrow(parkntrail))[-c(cluA,cluB,cluC)]
# placeholder column; overwritten with letters below (the numeric vector
# is silently coerced to character on the first letter assignment)
parkntrail$CLUSTER = numeric(nrow(parkntrail))
parkntrail$CLUSTER[cluA] = "A"
parkntrail$CLUSTER[cluB] = "B"
parkntrail$CLUSTER[cluC] = "C"
parkntrail$CLUSTER[cluD] = "D"
## CLUSTERs on a map
parkntrail$CLUSTER = as.factor(parkntrail$CLUSTER)
# four-colour palette for clusters A-D, assembled by hand from two
# wesanderson palettes
plot6 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = CLUSTER), size = 2.5, alpha = 0.9,
data = parkntrail) + coord_fixed(1.6)+
scale_color_manual(values=c(wes_palette(n=5, name="BottleRocket2")[c(1,2)],
wes_palette(n=4, name="GrandBudapest1")[2],
wes_palette(n=5, name="BottleRocket2")[4]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
plot6
## Warning: Removed 1 rows containing missing values (geom_rect).
Cluster-A parks are generally located in the southwest
Cluster-B parks are around central southeast
Cluster-C parks are found from north to south but many of them are not close to any shorelines
Cluster-D parks are everywhere and most of them are along the shorelines.
# Append the ggmap of CLUSTER to each of the other trail attribute ggmaps
# (cluster map on the left, one attribute map on the right; widths/heights
# hand-tuned so the panels render at the same visual size)
ggarrange(plot6, plot1, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot6, plot2, ncol = 2, nrow = 1, widths = c(1,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot6, plot3, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot6, plot4, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
ggarrange(plot6, plot5, ncol = 2, nrow = 1, widths = c(1,1), heights = c(1,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
Cluster-A parks: gradual or moderate slopes, natural earth surface, narrow trail width, have canopy and some with closed canopy, very bad trail condition.
Cluster-B parks: non-flat topography, hard and durable constructed surface, moderate width, open canopy, very good trail condition.
Cluster-C parks: mostly non-flat slopes, some have natural earth surface some have constructed surface, very few high or closed canopy, mostly good condition.
Cluster-D parks: a mixture of every type, generally flatter topography, constructed surface, larger width, mostly open canopy, some have good condition and some have the worst condition.
Note that clusters C and D are relatively hard to discriminate from each other, as these two clusters were combined at a relatively short distance; they have a lot in common. By contrast, cluster A and cluster B are quite different, except that neither of them has flat slopes. This also makes sense because their clusters merge at a greater height of fusion — recall the dendrogram.
Some valuable information:
Cluster-A parks: WEST DUWAMISH GREENBELT, SEOLA BEACH GREENBELT and THORNTON CREEK PARK #2 need more attention for better maintenance or upgrades.
Cluster-B parks: PEPPIS PLAYGROUND, HITT’S HILL PARK and LAKE PEOPLE PARK(XACUA’BS) have good conditions and these can be explained by the corresponding features: flat ground(less likely to have erosion), durable surface types with proper width, no canopy (less likely to have overgrown, lower moisture).
We may need to further group the parks in cluster D to separate those with good-condition trails from those with worse-condition trails.
# take geographic location into consideration
# append lon/lat columns to the park-level attribute table
trails4 = data.frame(trails4, park_geoc)
# NOTE(review): lon/lat enter the cosine similarity unscaled alongside
# ordinal attributes (1-4) and total length (thousands), so the
# geographic contribution is not balanced against the trail attributes
# -- consider standardizing the columns first.
# Recalculate the similarity matrix and distances
park_trail_simi2 = matrix(0, nrow=nrow(trails4), ncol=nrow(trails4))
for (i in 1:nrow(trails4)) {
for (j in 1:nrow(trails4)) {
x = as.numeric(trails4[i,])
y = as.numeric(trails4[j,])
park_trail_simi2[i,j] = cos.simi(x,y)
}
}
dimnames(park_trail_simi2) = list(rownames(trails4), rownames(trails4))
D_park2 = as.dist(1 - park_trail_simi2)
## hierarchical clustering and dendrogram
hc4 = hclust(D_park2, method="complete")
# dashed line = cut height used to define clusters E-H below
ggdendrogram(hc4, rotate = TRUE, theme_dendro = FALSE) +
geom_hline(yintercept = 0.005, linetype = 'dashed')
# clusters read off the hc4 dendrogram (attributes + lon/lat)
# Park clusterE: WEST DUWAMISH GREENBELT
# Park clusterF: PEPPIS PLAYGROUND, HITT'S HILL PARK, LICORICE FERN NA ON TC,
# LAKE PEOPLE PARK(XACUA'BS), THORNTON CREEK PARK #2
# Park clusterG: DUWAMISH HEAD GREENBELT, BOREN PARK, CHEASTY GREENSPACE: MT VIEW,
# LONGFELLOW CREEK GS: NORTH, CHEASTY GREENSPACE, PRITCHARD ISLAND
# BEACH, LONGFELLOW CREEK GREENSPACE, SOLSTICE PARK, DEARBORN PARK,
# ST. MARKS GREENBELT, MARTHA WASHINGTON PARK, NORTH EAST QUEEN ANNE GREENBELT, BHY KRACKE PARK, LONGFELLOW CREEK GS: SOUTH, SW QUEEN ANNE GREENBELT, LAKEVIEW PARK, KIWANIS MEMORIAL PRESERVE PARK, CEDAR PARK, SEOLA BEACH GREENBELT
# Park clusterH: the rest of parks
# NOTE(review): as with clusters A-D, names that do not match
# parkntrail$park exactly fall through to cluster H -- verify spellings.
cluE= which(parkntrail$park %in%
c("WEST DUWAMISH GREENBELT"))
cluF = which(parkntrail$park %in%
c("PEPPIS PLAYGROUND", "HITT'S HILL PARK", "LICORICE FERN NA ON TC",
"LAKE PEOPLE PARK (XACUA'BS)", "THORNTON CREEK PARK #2"))
cluG = which(parkntrail$park %in%
c("DUWAMISH HEAD GREENBELT", "BOREN PARK", "CHEASTY GREENSPACE: MT VIEW",
"LONGFELLOW CREEK GS: NORTH", "CHEASTY GREENSPACE", "PRITCHARD ISLAND BEACH",
"LONGFELLOW CREEK GREENSPACE", "SOLSTICE PARK", "DEARBORN PARK",
"ST. MARKS GREENBELT", "MARTHA WASHINGTON PARK",
"NORTH EAST QUEEN ANNE GREENBELT", "BHY KRACKE PARK",
"LONGFELLOW CREEK GS: SOUTH", "SW QUEEN ANNE GREENBELT",
"LAKEVIEW PARK", "KIWANIS MEMORIAL PRESERVE PARK", "CEDAR PARK",
"SEOLA BEACH GREENBELT"))
cluH = (1:nrow(parkntrail))[-c(cluE,cluF,cluG)]
# placeholder column; coerced to character on the first letter assignment
parkntrail$CLUSTER2 = numeric(nrow(parkntrail))
parkntrail$CLUSTER2[cluE] = "E"
parkntrail$CLUSTER2[cluF] = "F"
parkntrail$CLUSTER2[cluG] = "G"
parkntrail$CLUSTER2[cluH] = "H"
## CLUSTERs on a map again (E-H, i.e. clustering WITH geography)
parkntrail$CLUSTER2 = as.factor(parkntrail$CLUSTER2)
# same hand-assembled four-colour palette as plot6 for comparability
plot7 = base_map + scale_x_continuous(limits = c(-122.6, -122.0), expand = c(0, 0)) +
scale_y_continuous(limits = c(47.4, 47.8), expand = c(0, 0)) +
geom_point(aes(x=lon, y=lat, color = CLUSTER2), size = 2.5, alpha = 0.9,
data = parkntrail) + coord_fixed(1.6)+
scale_color_manual(values=c(wes_palette(n=5, name="BottleRocket2")[c(1,2)],
wes_palette(n=4, name="GrandBudapest1")[2],
wes_palette(n=5, name="BottleRocket2")[4]))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
plot7
## Warning: Removed 1 rows containing missing values (geom_rect).
# side-by-side: with-geography clusters (left) vs attribute-only (right)
ggarrange(plot7, plot6, ncol = 2, nrow = 1, widths = c(1.05,1), heights = c(1.05,1))
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
# big 4-by-4 map matrix, need 16 datasets
# Open--1, low--2, high--3, Closed--4
# Flat--1, gradual--2, moderate--3, steep--4
# Helper: parks whose dominant canopy / grade type match the given ordinal
# codes.  CANOPY and GRADE_TYPE are factors at this point, but comparing a
# factor with a number compares against the level label, so the original
# `CANOPY == 1` semantics are preserved exactly.
canopy_grade_subset = function(canopy_lv, grade_lv) {
  filter(parkntrail, CANOPY == canopy_lv, GRADE_TYPE == grade_lv)
}
# open canopy (1) x flat/gradual/moderate/steep
open_flat       = canopy_grade_subset(1, 1)
open_gradual    = canopy_grade_subset(1, 2)
open_moderate   = canopy_grade_subset(1, 3)
open_steep      = canopy_grade_subset(1, 4)
# low canopy (2)
low_flat        = canopy_grade_subset(2, 1)
low_gradual     = canopy_grade_subset(2, 2)
low_moderate    = canopy_grade_subset(2, 3)
low_steep       = canopy_grade_subset(2, 4)
# high canopy (3)
high_flat       = canopy_grade_subset(3, 1)
high_gradual    = canopy_grade_subset(3, 2)
high_moderate   = canopy_grade_subset(3, 3)
high_steep      = canopy_grade_subset(3, 4)
# closed canopy (4)
closed_flat     = canopy_grade_subset(4, 1)
closed_gradual  = canopy_grade_subset(4, 2)
closed_moderate = canopy_grade_subset(4, 3)
closed_steep    = canopy_grade_subset(4, 4)
## 16 ggmaps using the above 16 filtered dataframes
# tighter crop than the earlier park maps (city core only)
base_map1 = base_map + scale_x_continuous(limits = c(-122.5, -122.2),
expand = c(0, 0)) + scale_y_continuous(limits = c(47.475, 47.75),
expand = c(0, 0))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
# One map per (canopy, grade) cell, points coloured by trail CONDITION.
# NOTE(review): each scale_color_manual() hand-picks palette entries to
# match the condition levels that happen to occur in that subset; if the
# data changes, the colour-to-condition mapping can silently shift --
# verify against the printed legends.
open_flat_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = open_flat) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
open_gradual_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = open_gradual) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,4,2)])
open_moderate_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = open_moderate) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[2])
open_steep_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = open_steep) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[2])
low_flat_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = low_flat) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[2])
low_gradual_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = low_gradual) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
low_moderate_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = low_moderate) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[1])
low_steep_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = low_steep) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[1])
# high- and closed-canopy cells of the 4x4 matrix; palette subsets again
# hand-matched to the condition levels present in each subset
high_flat_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = high_flat) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[2])
high_gradual_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = high_gradual) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
high_moderate_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = high_moderate) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
high_steep_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = high_steep) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
# closed_flat gets no manual scale (ggplot defaults apply); presumably no
# parks fall in this cell -- see the write-up at the end
closed_flat_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = closed_flat) + coord_fixed(1.6)
closed_gradual_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = closed_gradual) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[1])
closed_moderate_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = closed_moderate) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[c(1,2)])
closed_steep_map = base_map1 + geom_point(aes(x=lon, y=lat, color = CONDITION),
size = 3, alpha = 0.9, data = closed_steep) + coord_fixed(1.6) +
scale_color_manual(values=wes_palette(n=4, name="Darjeeling1")[1])
## Align the maps as 4 by 4 map matrix
# one row per canopy level (closed at top ... open at bottom), one column
# per grade type (flat ... steep); legends suppressed to save space
gg1 = ggarrange(closed_flat_map, closed_gradual_map, closed_moderate_map,
closed_steep_map, ncol = 4, nrow = 1, legend = "none")
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
gg2 = ggarrange(high_flat_map, high_gradual_map, high_moderate_map,
high_steep_map, ncol = 4, nrow = 1, legend = "none")
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
gg3 = ggarrange(low_flat_map, low_gradual_map, low_moderate_map,
low_steep_map, ncol = 4, nrow = 1, legend = "none")
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
gg4 = ggarrange(open_flat_map, open_gradual_map, open_moderate_map,
open_steep_map, ncol = 4, nrow = 1, legend = "none")
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 1 rows containing missing values (geom_rect).
# stack the four rows and add an overall title and axis captions
fig = ggarrange(gg1, gg2, gg3, gg4, ncol = 1, nrow = 4)
annotate_figure(fig, top = text_grob("Park Trails Condition by Canopy and Grade Type",
color = "Black", size = 16),
left = text_grob("Canopy: open -- low -- high -- closed (from bottom to top)",
color = "Black", size = 14, rot = 90),
bottom = text_grob("Grade Type: flat -- gradual -- moderate -- steep
(from left to right)", color = "Black", size = 14))
Red (condition 1, worn/eroded), orange (condition 2, poor), cyan (condition 4, good).
Most of the trails in Seattle are in open-canopy environments with flat/gradual/moderate grade types.
Steeper trails are usually under greater tree canopy(probably in the forest with some mountains).
Open canopy, flatter topography is correlated to better trail condition.
Higher canopy coverage and steeper slope is correlated to worse trail condition.
Very few parks in Seattle area have trails under greater canopy coverage and on flatter land. Very few parks have trails under low canopy coverage, let alone low canopy coverage and more challenging grade types.